{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SELECT from WORLD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
     ]
    }
   ],
   "source": [
    "import findspark\n",
    "import pandas as pd\n",
    "findspark.init()\n",
    "\n",
    "SVR = '192.168.31.31'\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "sc = (SparkSession.builder.appName('app02') \n",
    "      .master(f'spark://{SVR}:7077') \n",
    "      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') \n",
    "      .config('spark.cores.max', '4') \n",
    "      .config('spark.executor.instances', '1') \n",
    "      .config('spark.executor.cores', '2') \n",
    "      .config('spark.executor.memory', '10g') \n",
    "      .enableHiveSupport().getOrCreate())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "world = sc.read.table('sqlzoo.world')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Introduction\n",
    "\n",
    "[Read the notes about this table](https://sqlzoo.net/wiki/Read_the_notes_about_this_table.). Observe the result of running this SQL command to show the name, continent and population of all countries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>continent</th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>Asia</td>\n",
       "      <td>32225560.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>Europe</td>\n",
       "      <td>2845955.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>Africa</td>\n",
       "      <td>43000000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>Europe</td>\n",
       "      <td>77543.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>Africa</td>\n",
       "      <td>31127674.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>190</th>\n",
       "      <td>Venezuela</td>\n",
       "      <td>South America</td>\n",
       "      <td>32219521.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>Vietnam</td>\n",
       "      <td>Asia</td>\n",
       "      <td>96208984.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>192</th>\n",
       "      <td>Yemen</td>\n",
       "      <td>Asia</td>\n",
       "      <td>29825968.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>193</th>\n",
       "      <td>Zambia</td>\n",
       "      <td>Africa</td>\n",
       "      <td>17885422.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>194</th>\n",
       "      <td>Zimbabwe</td>\n",
       "      <td>Africa</td>\n",
       "      <td>15159624.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>195 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            name      continent  population\n",
       "0    Afghanistan           Asia  32225560.0\n",
       "1        Albania         Europe   2845955.0\n",
       "2        Algeria         Africa  43000000.0\n",
       "3        Andorra         Europe     77543.0\n",
       "4         Angola         Africa  31127674.0\n",
       "..           ...            ...         ...\n",
       "190    Venezuela  South America  32219521.0\n",
       "191      Vietnam           Asia  96208984.0\n",
       "192        Yemen           Asia  29825968.0\n",
       "193       Zambia         Africa  17885422.0\n",
       "194     Zimbabwe         Africa  15159624.0\n",
       "\n",
       "[195 rows x 3 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "world.select('name', 'continent', 'population').toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Large Countries\n",
    "\n",
    "[How to use WHERE to filter records](https://sqlzoo.net/wiki/WHERE_filters). Show the name for the countries that have a population of at least 200 million. 200 million is 200000000, there are eight zeros."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Brazil</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>China</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>India</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Indonesia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Nigeria</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Pakistan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>United States</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            name\n",
       "0         Brazil\n",
       "1          China\n",
       "2          India\n",
       "3      Indonesia\n",
       "4        Nigeria\n",
       "5       Pakistan\n",
       "6  United States"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "world.filter(world['population']>=2e8).select('name').toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Per capita GDP\n",
    "\n",
    "Give the `name` and the **per capita GDP** for those countries with a `population` of at least 200 million.\n",
    "\n",
    "> _HELP:How to calculate per capita GDP_   \n",
    "> per capita GDP is the GDP divided by the population GDP/population"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>pcgdp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>9721.370041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>China</td>\n",
       "      <td>8724.306440</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>India</td>\n",
       "      <td>1891.781051</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Indonesia</td>\n",
       "      <td>3804.772286</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Nigeria</td>\n",
       "      <td>1822.886159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Pakistan</td>\n",
       "      <td>1377.036279</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>United States</td>\n",
       "      <td>59121.192067</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            name         pcgdp\n",
       "0         Brazil   9721.370041\n",
       "1          China   8724.306440\n",
       "2          India   1891.781051\n",
       "3      Indonesia   3804.772286\n",
       "4        Nigeria   1822.886159\n",
       "5       Pakistan   1377.036279\n",
       "6  United States  59121.192067"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(world.withColumn('pcgdp', world['gdp']/world['population'])\n",
    "    .filter(world['population']>=2e8)\n",
    "    .select('name', 'pcgdp')\n",
    "    .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. South America In millions\n",
    "\n",
    "Show the `name` and `population` in millions for the countries of the `continent` 'South America'. Divide the population by 1000000 to get population in millions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>popl</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Argentina</td>\n",
       "      <td>44.938712</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Bolivia</td>\n",
       "      <td>11.469896</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>211.442625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Chile</td>\n",
       "      <td>19.107216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Colombia</td>\n",
       "      <td>49.395678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Ecuador</td>\n",
       "      <td>17.472948</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Guyana</td>\n",
       "      <td>0.782766</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Paraguay</td>\n",
       "      <td>7.252672</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Peru</td>\n",
       "      <td>32.131400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Saint Vincent and the Grenadines</td>\n",
       "      <td>0.110608</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Suriname</td>\n",
       "      <td>0.581372</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Uruguay</td>\n",
       "      <td>3.518552</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Venezuela</td>\n",
       "      <td>32.219521</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                name        popl\n",
       "0                          Argentina   44.938712\n",
       "1                            Bolivia   11.469896\n",
       "2                             Brazil  211.442625\n",
       "3                              Chile   19.107216\n",
       "4                           Colombia   49.395678\n",
       "5                            Ecuador   17.472948\n",
       "6                             Guyana    0.782766\n",
       "7                           Paraguay    7.252672\n",
       "8                               Peru   32.131400\n",
       "9   Saint Vincent and the Grenadines    0.110608\n",
       "10                          Suriname    0.581372\n",
       "11                           Uruguay    3.518552\n",
       "12                         Venezuela   32.219521"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(world.withColumn('popl', world['population']/1e6)\n",
    "    .filter(world['continent']=='South America')\n",
    "    .select('name', 'popl')\n",
    "    .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. France, Germany, Italy\n",
    "\n",
    "Show the `name` and `population` for France, Germany, Italy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>France</td>\n",
       "      <td>67076000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Germany</td>\n",
       "      <td>83149300.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Italy</td>\n",
       "      <td>60238522.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      name  population\n",
       "0   France  67076000.0\n",
       "1  Germany  83149300.0\n",
       "2    Italy  60238522.0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(world.filter(world['name'].isin(['France', 'Germany', 'Italy']))\n",
    "     .select('name', 'population')\n",
    "     .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. United\n",
    "\n",
    "Show the countries which have a `name` that includes the word 'United'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>United Arab Emirates</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>United Kingdom</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>United States</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   name\n",
       "0  United Arab Emirates\n",
       "1        United Kingdom\n",
       "2         United States"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "world.filter(world['name'].contains('United')).select('name').toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Two ways to be big\n",
    "\n",
    "Two ways to be big: A country is **big** if it has an area of more than 3 million sq km or it has a population of more than 250 million.\n",
    "\n",
    "**Show the countries that are big by area or big by population. Show name, population and area.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>population</th>\n",
       "      <th>area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Australia</td>\n",
       "      <td>2.569002e+07</td>\n",
       "      <td>7692024.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>2.114426e+08</td>\n",
       "      <td>8515767.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Canada</td>\n",
       "      <td>3.800717e+07</td>\n",
       "      <td>9984670.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>China</td>\n",
       "      <td>1.402379e+09</td>\n",
       "      <td>9596961.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>India</td>\n",
       "      <td>1.361503e+09</td>\n",
       "      <td>3166414.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Indonesia</td>\n",
       "      <td>2.669119e+08</td>\n",
       "      <td>1904569.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Russia</td>\n",
       "      <td>1.467451e+08</td>\n",
       "      <td>17125242.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>United States</td>\n",
       "      <td>3.295839e+08</td>\n",
       "      <td>9826675.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            name    population        area\n",
       "0      Australia  2.569002e+07   7692024.0\n",
       "1         Brazil  2.114426e+08   8515767.0\n",
       "2         Canada  3.800717e+07   9984670.0\n",
       "3          China  1.402379e+09   9596961.0\n",
       "4          India  1.361503e+09   3166414.0\n",
       "5      Indonesia  2.669119e+08   1904569.0\n",
       "6         Russia  1.467451e+08  17125242.0\n",
       "7  United States  3.295839e+08   9826675.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(world.filter((world['area']>3e6) | (world['population']>2.5e8))\n",
    "    .select('name', 'population', 'area')\n",
    "    .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. One or the other (but not both)\n",
    "\n",
    "**Exclusive OR (XOR). Show the countries that are big by area (more than 3 million) or big by population (more than 250 million) but not both. Show name, population and area.**\n",
    "\n",
    "- Australia has a big area but a small population, it should be **included**.\n",
    "- Indonesia has a big population but a small area, it should be **included**.\n",
    "- China has a big population **and** big area, it should be **excluded**.\n",
    "- United Kingdom has a small population and a small area, it should be **excluded**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>population</th>\n",
       "      <th>area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Australia</td>\n",
       "      <td>25690023.0</td>\n",
       "      <td>7692024.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>211442625.0</td>\n",
       "      <td>8515767.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Canada</td>\n",
       "      <td>38007166.0</td>\n",
       "      <td>9984670.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Indonesia</td>\n",
       "      <td>266911900.0</td>\n",
       "      <td>1904569.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Russia</td>\n",
       "      <td>146745098.0</td>\n",
       "      <td>17125242.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        name   population        area\n",
       "0  Australia   25690023.0   7692024.0\n",
       "1     Brazil  211442625.0   8515767.0\n",
       "2     Canada   38007166.0   9984670.0\n",
       "3  Indonesia  266911900.0   1904569.0\n",
       "4     Russia  146745098.0  17125242.0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(world.filter((world['area']>3e6) != (world['population']>2.5e8))\n",
    "     .select('name', 'population', 'area')\n",
    "     .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Rounding\n",
    "\n",
    "Show the `name` and `population` in millions and the GDP in billions for the countries of the `continent` 'South America'. Use the [ROUND](https://sqlzoo.net/wiki/ROUND) function to show the values to two decimal places.\n",
    "\n",
    "**For South America show population in millions and GDP in billions both to 2 decimal places.**\n",
    "\n",
    "> _Millions and billions_    \n",
    "> Divide by 1000000 (6 zeros) for millions. Divide by 1000000000 (9 zeros) for billions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>popl</th>\n",
       "      <th>gdp_</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Argentina</td>\n",
       "      <td>44.94</td>\n",
       "      <td>637.49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Bolivia</td>\n",
       "      <td>11.47</td>\n",
       "      <td>37.51</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>211.44</td>\n",
       "      <td>2055.51</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Chile</td>\n",
       "      <td>19.11</td>\n",
       "      <td>277.08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Colombia</td>\n",
       "      <td>49.40</td>\n",
       "      <td>309.19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Ecuador</td>\n",
       "      <td>17.47</td>\n",
       "      <td>104.30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Guyana</td>\n",
       "      <td>0.78</td>\n",
       "      <td>3.09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Paraguay</td>\n",
       "      <td>7.25</td>\n",
       "      <td>29.44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Peru</td>\n",
       "      <td>32.13</td>\n",
       "      <td>211.40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Saint Vincent and the Grenadines</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Suriname</td>\n",
       "      <td>0.58</td>\n",
       "      <td>5.21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Uruguay</td>\n",
       "      <td>3.52</td>\n",
       "      <td>59.18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Venezuela</td>\n",
       "      <td>32.22</td>\n",
       "      <td>255.09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                name    popl     gdp_\n",
       "0                          Argentina   44.94   637.49\n",
       "1                            Bolivia   11.47    37.51\n",
       "2                             Brazil  211.44  2055.51\n",
       "3                              Chile   19.11   277.08\n",
       "4                           Colombia   49.40   309.19\n",
       "5                            Ecuador   17.47   104.30\n",
       "6                             Guyana    0.78     3.09\n",
       "7                           Paraguay    7.25    29.44\n",
       "8                               Peru   32.13   211.40\n",
       "9   Saint Vincent and the Grenadines    0.11     0.73\n",
       "10                          Suriname    0.58     5.21\n",
       "11                           Uruguay    3.52    59.18\n",
       "12                         Venezuela   32.22   255.09"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.sql.functions import round\n",
    "(world.filter(world['continent']=='South America')\n",
    "    .withColumn('popl', round(world['population']/1e6, 2))\n",
    "    .withColumn('gdp_', round(world['gdp']/1e9, 2))\n",
    "    .select('name', 'popl', 'gdp_')\n",
    "    .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Trillion dollar economies\n",
    "\n",
    "Show the `name` and per-capita GDP for those countries with a GDP of at least one trillion (1000000000000; that is 12 zeros). Round this value to the nearest 1000.\n",
    "\n",
    "**Show per-capita GDP for the trillion dollar countries to the nearest $1000.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>pcgdp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Australia</td>\n",
       "      <td>55000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>10000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Canada</td>\n",
       "      <td>43000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>China</td>\n",
       "      <td>9000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>France</td>\n",
       "      <td>39000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Germany</td>\n",
       "      <td>44000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>India</td>\n",
       "      <td>2000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Indonesia</td>\n",
       "      <td>4000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Italy</td>\n",
       "      <td>32000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Japan</td>\n",
       "      <td>39000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Mexico</td>\n",
       "      <td>9000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Russia</td>\n",
       "      <td>10000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>South Korea</td>\n",
       "      <td>22000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Spain</td>\n",
       "      <td>28000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>United Kingdom</td>\n",
       "      <td>40000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>United States</td>\n",
       "      <td>59000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              name    pcgdp\n",
       "0        Australia  55000.0\n",
       "1           Brazil  10000.0\n",
       "2           Canada  43000.0\n",
       "3            China   9000.0\n",
       "4           France  39000.0\n",
       "5          Germany  44000.0\n",
       "6            India   2000.0\n",
       "7        Indonesia   4000.0\n",
       "8            Italy  32000.0\n",
       "9            Japan  39000.0\n",
       "10          Mexico   9000.0\n",
       "11          Russia  10000.0\n",
       "12     South Korea  22000.0\n",
       "13           Spain  28000.0\n",
       "14  United Kingdom  40000.0\n",
       "15   United States  59000.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(world.withColumn('pcgdp', round(world['gdp']/(1000*world['population']), 0)*1000)\n",
    "    .filter(world['gdp']>1e12)\n",
    "    .select('name', 'pcgdp')\n",
    "    .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 11. Name and capital have the same length\n",
    "\n",
    "Greece has capital Athens.\n",
    "\n",
    "Each of the strings 'Greece', and 'Athens' has 6 characters.\n",
    "\n",
    "**Show the name and capital where the name and the capital have the same number of characters.**\n",
    "\n",
    "- You can use the [LENGTH](https://sqlzoo.net/wiki/LENGTH) function to find the number of characters in a string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>capital</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>Algiers</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Angola</td>\n",
       "      <td>Luanda</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Armenia</td>\n",
       "      <td>Yerevan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Botswana</td>\n",
       "      <td>Gaborone</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Canada</td>\n",
       "      <td>Ottowa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Djibouti</td>\n",
       "      <td>Djibouti</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Egypt</td>\n",
       "      <td>Cairo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Estonia</td>\n",
       "      <td>Tallinn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Fiji</td>\n",
       "      <td>Suva</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Gambia</td>\n",
       "      <td>Banjul</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Georgia</td>\n",
       "      <td>Tbilisi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Ghana</td>\n",
       "      <td>Accra</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Greece</td>\n",
       "      <td>Athens</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Luxembourg</td>\n",
       "      <td>Luxembourg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Mauritania</td>\n",
       "      <td>Nouakchott</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Paraguay</td>\n",
       "      <td>Asunción</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Peru</td>\n",
       "      <td>Lima</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Poland</td>\n",
       "      <td>Warsaw</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Russia</td>\n",
       "      <td>Moscow</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Rwanda</td>\n",
       "      <td>Kigali</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>San Marino</td>\n",
       "      <td>San Marino</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>Singapore</td>\n",
       "      <td>Singapore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Taiwan</td>\n",
       "      <td>Taipei</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>Togo</td>\n",
       "      <td>Lomé</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Turkey</td>\n",
       "      <td>Ankara</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>Zambia</td>\n",
       "      <td>Lusaka</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          name     capital\n",
       "0      Algeria     Algiers\n",
       "1       Angola      Luanda\n",
       "2      Armenia     Yerevan\n",
       "3     Botswana    Gaborone\n",
       "4       Canada      Ottowa\n",
       "5     Djibouti    Djibouti\n",
       "6        Egypt       Cairo\n",
       "7      Estonia     Tallinn\n",
       "8         Fiji        Suva\n",
       "9       Gambia      Banjul\n",
       "10     Georgia     Tbilisi\n",
       "11       Ghana       Accra\n",
       "12      Greece      Athens\n",
       "13  Luxembourg  Luxembourg\n",
       "14  Mauritania  Nouakchott\n",
       "15    Paraguay    Asunción\n",
       "16        Peru        Lima\n",
       "17      Poland      Warsaw\n",
       "18      Russia      Moscow\n",
       "19      Rwanda      Kigali\n",
       "20  San Marino  San Marino\n",
       "21   Singapore   Singapore\n",
       "22      Taiwan      Taipei\n",
       "23        Togo        Lomé\n",
       "24      Turkey      Ankara\n",
       "25      Zambia      Lusaka"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.sql.functions import length\n",
    "(world.filter(length(world['name'])==length(world['capital']))\n",
    "    .select('name', 'capital')\n",
    "    .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 12. Matching name and capital\n",
    "\n",
    "The capital of Sweden is Stockholm. Both words start with the letter 'S'.\n",
    "\n",
    "**Show the name and the capital where the first letters of each match. Don't include countries where the name and the capital are the same word.**\n",
    "\n",
    "- You can use the function [LEFT](https://sqlzoo.net/wiki/LEFT) to isolate the first character.\n",
    "- You can use <> as the **NOT EQUALS** operator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>capital</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>Algiers</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>Andorra la Vella</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Barbados</td>\n",
       "      <td>Bridgetown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Belize</td>\n",
       "      <td>Belmopan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>Brasília</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Brunei</td>\n",
       "      <td>Bandar Seri Begawan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Burundi</td>\n",
       "      <td>Bujumbura</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Djibouti</td>\n",
       "      <td>Djibouti</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Guatemala</td>\n",
       "      <td>Guatemala City</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Guyana</td>\n",
       "      <td>Georgetown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Kuwait</td>\n",
       "      <td>Kuwait City</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Luxembourg</td>\n",
       "      <td>Luxembourg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Maldives</td>\n",
       "      <td>Malé</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Marshall Islands</td>\n",
       "      <td>Majuro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Mexico</td>\n",
       "      <td>Mexico City</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Monaco</td>\n",
       "      <td>Monaco-Ville</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Mozambique</td>\n",
       "      <td>Maputo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Niger</td>\n",
       "      <td>Niamey</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Panama</td>\n",
       "      <td>Panama City</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Papua New Guinea</td>\n",
       "      <td>Port Moresby</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>San Marino</td>\n",
       "      <td>San Marino</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>Sao Tomé and Príncipe</td>\n",
       "      <td>São Tomé</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Singapore</td>\n",
       "      <td>Singapore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>South Korea</td>\n",
       "      <td>Seoul</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Sri Lanka</td>\n",
       "      <td>Sri Jayawardenepura Kotte</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>Sweden</td>\n",
       "      <td>Stockholm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>Taiwan</td>\n",
       "      <td>Taipei</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>Tunisia</td>\n",
       "      <td>Tunis</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     name                    capital\n",
       "0                 Algeria                    Algiers\n",
       "1                 Andorra           Andorra la Vella\n",
       "2                Barbados                 Bridgetown\n",
       "3                  Belize                   Belmopan\n",
       "4                  Brazil                   Brasília\n",
       "5                  Brunei        Bandar Seri Begawan\n",
       "6                 Burundi                  Bujumbura\n",
       "7                Djibouti                   Djibouti\n",
       "8               Guatemala             Guatemala City\n",
       "9                  Guyana                 Georgetown\n",
       "10                 Kuwait                Kuwait City\n",
       "11             Luxembourg                 Luxembourg\n",
       "12               Maldives                       Malé\n",
       "13       Marshall Islands                     Majuro\n",
       "14                 Mexico                Mexico City\n",
       "15                 Monaco               Monaco-Ville\n",
       "16             Mozambique                     Maputo\n",
       "17                  Niger                     Niamey\n",
       "18                 Panama                Panama City\n",
       "19       Papua New Guinea               Port Moresby\n",
       "20             San Marino                 San Marino\n",
       "21  Sao Tomé and Príncipe                   São Tomé\n",
       "22              Singapore                  Singapore\n",
       "23            South Korea                      Seoul\n",
       "24              Sri Lanka  Sri Jayawardenepura Kotte\n",
       "25                 Sweden                  Stockholm\n",
       "26                 Taiwan                     Taipei\n",
       "27                Tunisia                      Tunis"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.sql.functions import substring\n",
    "(world.filter(substring(world['name'], 1, 1)==substring(world['capital'], 1, 1))\n",
    "    .select('name', 'capital')\n",
    "    .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 13. All the vowels\n",
    "\n",
    "**Equatorial Guinea** and **Dominican Republic** have all of the vowels (a e i o u) in the name. They don't count because they have more than one word in the name.\n",
    "\n",
    "**Find the country that has all the vowels and no spaces in its name.**\n",
    "\n",
    "- You can use the phrase name `NOT LIKE '%a%'` to exclude characters from your results.\n",
    "- The query shown misses countries like Bahamas and Belarus because they contain at least one 'a'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Mozambique</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         name\n",
       "0  Mozambique"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(world.filter((world['name'].rlike('[Aa]')) &\n",
    "               world['name'].rlike('[Ee]') &\n",
    "               world['name'].rlike('[Ii]') &\n",
    "               world['name'].rlike('[Oo]') &\n",
    "               world['name'].rlike('[Uu]') &\n",
    "               world['name'].rlike(r'^\\S+$'))\n",
    "    .select('name')\n",
    "    .toPandas())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
